This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(lme4)
library(ggplot2)
library(plotrix)
library(visreg)
library(stringr)
library(dplyr)
library(ggpubr)
library(gridExtra)
# Load the Lasso position-sampling results and the sampled-dataset metadata,
# then join them on the dataset path so each Lasso run carries its source info.
lasso_output_data <- data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/ls_c.csv", header = TRUE, na.strings = ""))
data_source <- data.frame(read.csv("/Users/noa/Workspace/data/sampled_datasets.csv", header = TRUE, na.strings = ""))
# Strip the cluster-specific prefix and MSA-file suffix so the remaining
# relative path matches data_source$path.
lasso_output_data$relative_path <- str_replace_all(lasso_output_data$dataset_id, '(/groups/pupko/noaeker/data/ABC_DR/)|(/ref_msa.aa.phy)', "")
data_only_lasso <- merge(lasso_output_data, data_source, by.x = "relative_path", by.y = "path")
# Sanity check: every Lasso row should have found a matching metadata row.
if (nrow(data_only_lasso) < nrow(lasso_output_data)) {
  print("Problem in matching path names")
  # BUG FIX: the original referenced a non-existent object `data` here,
  # which would itself error exactly when this diagnostic is needed.
  cat("nrow lasso output=", nrow(lasso_output_data), "nrow merged=", nrow(data_only_lasso))
}
#spr_new<-merge(good_datasets, spr_new, by = "dataset_id")
#test<-spr_new %>% group_by(dataset_id) %>% count(dataset_id)
# Distribution of the MAD statistic across runs.
print(summary(data_only_lasso$mad))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0830 0.1390 0.1830 0.1894 0.2230 0.3480
# Distribution of tree divergence across all Lasso runs.
print(summary(data_only_lasso$divergence))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.3357 0.9365 1.4852 1.6892 2.2875 5.3054
# Distribution of the Lasso test-set MSE (note the heavy right tail).
print(summary(data_only_lasso$lasso_test_mse))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.7 63.4 279.3 3630.7 1516.7 590641.5
General statistics on the data
# One row per unique (db, dataset) pair, then count datasets per source database.
file_name_and_source_lasso=data_only_lasso[!duplicated(data_only_lasso[ , c("db","dataset_id")]),]
table(file_name_and_source_lasso$db)
Selectome
54
# Distribution of the number of loci (MSA columns) per dataset.
print(summary(data_only_lasso$n_loci))
Min. 1st Qu. Median Mean 3rd Qu. Max.
977 1884 2368 2424 2815 4593
Helper functions used throughout the analysis
# Draw a styled histogram of `x`: blue fill, red outline, 0.2 alpha,
# centered title. `title_size` and `text_size` control the title and
# overall text sizes of the theme.
plot.hist <- function(x, main, xlab, title_size = 13, text_size = 12) {
  base_plot <- qplot(x,
                     geom = "histogram",
                     main = main,
                     xlab = xlab,
                     fill = I("blue"),
                     col = I("red"),
                     alpha = I(.2))
  base_plot +
    theme(plot.title = element_text(hjust = 0.5, size = title_size)) +
    theme(text = element_text(size = text_size))
}
# Extract the overall F-test p-value from a fitted `lm` object.
# `summary.lm` reports the F statistic but not its p-value, so we derive it
# from the statistic and its degrees of freedom.
#
# modelobject: a fitted model of class "lm".
# Returns: the model-level p-value as a bare numeric scalar.
lmp <- function(modelobject) {
  # inherits() is the robust class test; class() may return a vector.
  if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  # f = c(value, numdf, dendf); upper tail gives P(F >= value).
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  # Drop the names attribute inherited from `f`.
  attributes(p) <- NULL
  return(p)
}
# Fit a simple linear regression of y on x, print its title and summary,
# and optionally draw a residuals-vs-fitted diagnostic plot.
#
# x, y: numeric vectors of equal length.
# main, xlab, ylab: labels for printing/plotting (ylab and cex are kept
#   for backward compatibility; the residual plot does not use them).
# resid_plot: if TRUE, plot residuals against fitted values with a zero line.
# Returns: c(R-squared, overall F-test p-value).
simple.regression <- function(x, y, main, xlab, ylab, cex = 1.0, resid_plot = FALSE) {
  linearMod <- lm((y) ~ x)
  print(main)
  print(summary(linearMod))
  r.squared <- summary(linearMod)$r.squared
  p.val <- lmp(linearMod)  # helper defined above
  if (isTRUE(resid_plot)) {
    plot(fitted(linearMod), resid(linearMod), main = main, xlab = xlab)
    abline(0, 0)
  }
  return(c(r.squared, p.val))
}
Lasso example using optimized branch lengths and chosen_size random trees
# Focus on a single example MSA (job 10) to illustrate the Lasso fit.
example_MSA_data_lasso = data_only_lasso%>%filter(job_id==10)
print(example_MSA_data_lasso)
# Per-tree predicted vs. true log-likelihoods on the Lasso training set.
training_optimized_chosen_size_example_data =
data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/training_sitelh_df_prediction.csv",header = TRUE, na.strings = ""))
names(training_optimized_chosen_size_example_data)[2:3]=c("training_true_val","training_predicted_val")
# Same for the held-out test set.
test_optimized_chosen_size_example_data =
data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/test_sitelh_df_prediction.csv",header = TRUE, na.strings = ""))
names(test_optimized_chosen_size_example_data)[2:3]=c("test_true_val","test_predicted_val")
# Training size used for this example (number of training trees).
chosen_size=800
example_MSA_data_lasso_chosen_size= example_MSA_data_lasso%>% filter(actucal_training_size==chosen_size)
example_MSA_data_lasso_chosen_size_optimized = example_MSA_data_lasso%>% filter(actucal_training_size==chosen_size, brlen_generator=="optimized")
print(dplyr::select(example_MSA_data_lasso_chosen_size,"dataset_id","brlen_generator","job_id","curr_msa_version_folder","n_seq", "db","number_loci_chosen","n_loci","alpha","lasso_training_R.2", "lasso_test_R.2","sample_pct", "lasso_training_spearmanr", "lasso_test_spearmanr"))
print(dplyr::select(example_MSA_data_lasso_chosen_size_optimized,"dataset_id","job_id","curr_msa_version_folder","n_seq", "db","number_loci_chosen","n_loci","alpha","lasso_training_R.2", "lasso_test_R.2","sample_pct", "lasso_training_spearmanr", "lasso_test_spearmanr"))
# Prediction error per tree: true minus predicted log-likelihood.
training_optimized_chosen_size_example_data["training delta ll"] =training_optimized_chosen_size_example_data["training_true_val"]-training_optimized_chosen_size_example_data["training_predicted_val"]
test_optimized_chosen_size_example_data["test delta ll"] =test_optimized_chosen_size_example_data["test_true_val"]-test_optimized_chosen_size_example_data["test_predicted_val"]
print(training_optimized_chosen_size_example_data[,c("training_predicted_val","training_true_val","training delta ll")])
print(test_optimized_chosen_size_example_data[,c("test_predicted_val","test_true_val","test delta ll")])
# Regression of predicted vs. true log-likelihood on the training set;
# p0 is c(R-squared, p-value) from simple.regression() defined above.
p0<-simple.regression(training_optimized_chosen_size_example_data$training_true_val,training_optimized_chosen_size_example_data$training_predicted_val,"Training set predicted log likelihood vs true log likelihood","Training set true log likelihood","Training set predicted log likelihood")
[1] "Training set predicted log likelihood vs true log likelihood"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-10.5921 -1.6414 -0.0004 1.4776 9.9491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.285e+01 1.114e+01 -2.95 0.00327 **
x 9.984e-01 5.409e-04 1845.85 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.901 on 798 degrees of freedom
Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
F-statistic: 3.407e+06 on 1 and 798 DF, p-value: < 2.2e-16
# Same regression on the held-out test set; p1 is c(R-squared, p-value).
p1<-simple.regression(test_optimized_chosen_size_example_data$test_true_val,test_optimized_chosen_size_example_data$test_predicted_val,"Test set predicted log likelihood vs true log likelihood","Test set true log likelihood","Test set predicted log likelihood",cex=1)
[1] "Test set predicted log likelihood vs true log likelihood"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-10.7685 -2.1954 0.0862 1.8887 15.1095
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -54.099352 36.540325 -1.481 0.142
x 0.997388 0.001773 562.489 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.41 on 98 degrees of freedom
Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
F-statistic: 3.164e+05 on 1 and 98 DF, p-value: < 2.2e-16
# Scatter + fitted-line plots of predicted vs. true log-likelihood,
# annotated with the R^2 / p-value computed above (p0, p1).
# NOTE(review): annotation coordinates are hard-coded for this example MSA's
# likelihood range — they will need adjusting for other datasets.
g0<-ggplot(training_optimized_chosen_size_example_data,aes(training_true_val, training_predicted_val)) +
geom_smooth(method='lm')+xlab("true log likelihood")+ylab("predicted log likelihood ")+geom_point()+ labs(title="Training set predicted log likelihood vs true log likelihood")+theme(plot.title = element_text(hjust = 0.5)) +annotate("text", x=-20300, y=-19800, label= sprintf("R^2 = %.4f",p0[1]),size=4) +annotate("text", x=-20300, y=-19950, label= "p value < 2e-324",size=4)
g1<-ggplot(test_optimized_chosen_size_example_data,aes(test_true_val, test_predicted_val)) +
geom_smooth(method='lm')+xlab("true log likelihood")+ylab("predicted log likelihood ")+geom_point()+ labs(title="Test set predicted log likelihood vs true log likelihood")+theme(plot.title = element_text(hjust = 0.5)) +annotate("text", x=-20300, y=-19800, label= sprintf("R^2 = %.4f",p1[1])) +annotate("text", x=-20300, y=-19950, label= sprintf("p value = %.2e",p1[2]))
# Stack the training and test panels into one labeled figure.
fig<-ggarrange(g0, g1, hjust=-1,vjust=1, heights=c(2,2),common.legend=TRUE,
          labels = c("A", "B"),
          ncol = 1, nrow =2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
# Render the combined figure.
fig
NA
NA
NA
NA
# Distribution of the test-set prediction error (true - predicted log-likelihood).
boxplot(test_optimized_chosen_size_example_data["test delta ll"])
summary(test_optimized_chosen_size_example_data["test delta ll"])
test delta ll
Min. :-15.00067
1st Qu.: -1.58644
Median : 0.06853
Mean : 0.28275
3rd Qu.: 2.14509
Max. : 11.48662
# Full test-set prediction table for the example MSA.
print(test_optimized_chosen_size_example_data)
NA
Datasets on which the Lasso performed poorly (test R-squared below 0.9)
# Runs where the Lasso generalized poorly (test R^2 < 0.9), with their key settings.
print( data_only_lasso[data_only_lasso$lasso_test_R.2<0.9,c("brlen_generator","actucal_training_size","n_seq","n_loci","sample_pct","lasso_test_R.2", "lasso_training_R.2", "lasso_training_spearmanr","number_loci_chosen")])
NA
Example MSA Lasso statistics
library(ggpubr)
# Lasso quality metrics for the example MSA, overall and at the chosen size.
dplyr::select(example_MSA_data_lasso,actucal_training_size,brlen_generator,lasso_test_R.2,lasso_test_spearmanr,sample_pct,number_loci_chosen)
dplyr::select(example_MSA_data_lasso%>%filter(actucal_training_size==chosen_size),actucal_training_size,brlen_generator,lasso_test_R.2,lasso_test_spearmanr,sample_pct,number_loci_chosen)
# Test R^2 per training size, split by branch-length generator.
g0<-ggplot(example_MSA_data_lasso, aes(x=as.factor(actucal_training_size),fill=brlen_generator, y=lasso_test_R.2))+
  geom_bar(stat="identity",position=position_dodge()) +
  xlab("Training size")+ylab("R squared ")+theme(axis.title=element_text(size=10))+guides(fill=guide_legend(title="Branch length distribution"))+ggtitle("Test set R squared vs. training size")+ theme(plot.title = element_text(hjust = 0.5)) +coord_cartesian(ylim = c(0.55, 1))
# BUG FIX: referencing the data frame with `$` inside aes() breaks ggplot's
# data masking (use the bare column name), and the title was a copy-paste of
# the R-squared plot although this panel shows MSE.
ggplot(example_MSA_data_lasso, aes(x=as.factor(actucal_training_size),fill=brlen_generator, y=lasso_test_mse))+
  geom_bar(stat="identity",position=position_dodge()) +
  xlab("Training size")+ylab("MSE ")+theme(axis.title=element_text(size=10))+guides(fill=guide_legend(title="Branch length distribution"))+ggtitle("Test set MSE vs. training size")+ theme(plot.title = element_text(hjust = 0.5))
# Percentage of positions the Lasso selects, per training size.
g2<-ggplot(example_MSA_data_lasso, aes(x=as.factor(actucal_training_size),fill=brlen_generator, y=sample_pct))+ labs(color = "Branch length distribution")+
  geom_bar(stat="identity",position=position_dodge()) +
  xlab("Training size")+ylab("% Positions")+ theme(axis.title=element_text(size=10),)+guides(fill=guide_legend(title="Branch length distribution"))+ggtitle("Percentage of positions chosen by Lasso vs. training size" )+scale_y_continuous(labels = scales::percent)+ theme(plot.title = element_text(hjust = 0.5))
ggarrange(g0, g2, hjust=-1,vjust=0, heights=c(2,2),common.legend=TRUE,
          labels = c("A", "B"),
          ncol = 1, nrow =2)
# Relationship between test MSE and test R^2 for this MSA.
plot(example_MSA_data_lasso$lasso_test_mse, example_MSA_data_lasso$lasso_test_R.2)
NA
NA
NA
NA
NA
# How many loci the Lasso chose per branch-length generator.
print(example_MSA_data_lasso[,c("brlen_generator","n_loci","number_loci_chosen")])
# Across all datasets: sampled fraction vs. MSA length.
lasso_unique<-unique(dplyr::select(data_only_lasso,dataset_id,n_loci,sample_pct))
plot(lasso_unique$n_loci,lasso_unique$sample_pct )
Figure 6 - Lasso on 50 MSAs, training size 3200 only
# Restrict to the largest training size for Figure 6.
chosen_size = 3200
data_lasso_only_chosen_size = data_only_lasso[data_only_lasso$actucal_training_size==chosen_size,]
# Overall medians of the Lasso quality metrics (empty group_by() = whole data).
lasso.chosen_size.summary = data_lasso_only_chosen_size %>%
  group_by() %>%
  summarise(across(.cols=c(lasso_test_R.2,lasso_test_spearmanr,sample_pct), .fns= list(Median=median), na.rm= TRUE),.groups = "drop")
print(lasso.chosen_size.summary)
# Same medians, broken down by branch-length generator.
lasso.chosen_size.summary.per.brlen = data_lasso_only_chosen_size %>%
  group_by(actucal_training_size,brlen_generator ) %>%
  summarise(across(.cols=c(lasso_test_R.2,lasso_test_spearmanr,sample_pct), .fns= list(Median=median), na.rm= TRUE),.groups = "drop")
print(lasso.chosen_size.summary.per.brlen)
#Figure 6
# Panel A: test-set R^2 histogram, colored by branch-length generator.
g0<-ggplot(data_lasso_only_chosen_size,aes(lasso_test_R.2,fill=brlen_generator))+
  geom_histogram()+
  labs(title="Lasso test set Pearson R squared")+xlab("Pearson R squared")+ labs(fill= "Branch length distribution")+
  theme(plot.title = element_text(hjust = 0.5))
# Panel B: fraction of positions chosen by the Lasso.
g2<- ggplot(data_lasso_only_chosen_size,aes(sample_pct,fill=brlen_generator))+
  geom_histogram()+
  labs(title="Percentage of positions chosen by Lasso")+xlab("% positions")+ labs(fill = "Branch length distribution")+
  theme(plot.title = element_text(hjust = 0.5))+theme(legend.position = "none")
ggarrange(g0, g2,hjust=-1,vjust=0,common.legend=TRUE,align="h",
          labels = c("A", "B"),
          ncol = 1, nrow =2)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
NA
NA
Figure 4 or Table 2 - results on all 50 MSAs
# Median test R^2 / sampled fraction per (training size, branch-length generator).
per.brlen.size <- dplyr::select(data_only_lasso,actucal_training_size,brlen_generator,lasso_test_R.2,sample_pct) %>%
  group_by(actucal_training_size,brlen_generator) %>%
  summarise(across(.fns= list(Median=median), na.rm= TRUE),.groups = "drop")
print(per.brlen.size)
# Median test MSE per (training size, branch-length generator).
per.brlen.size.mse <- dplyr::select(data_only_lasso,lasso_test_mse,actucal_training_size,brlen_generator) %>%
  group_by(actucal_training_size,brlen_generator) %>%
  summarise(across(.fns= list(Median=median), na.rm= TRUE),.groups = "drop")
print(per.brlen.size.mse )
optimized_mse <- filter(per.brlen.size.mse, brlen_generator=="optimized")
optimized_R2 <- filter(per.brlen.size, brlen_generator=="optimized")
# BUG FIX: base plot() takes its title via `main=`; `title=` is silently
# passed to par() and is not a valid graphical parameter here.
plot(optimized_mse$actucal_training_size, optimized_mse$lasso_test_mse_Median, type="b", xlab="Training size", ylab="Median test MSE", main="Median test MSE vs. training size for optimized branch-length")
g00 <- ggplot(optimized_mse, aes(x=actucal_training_size, y=lasso_test_mse_Median)) +
  geom_line() + geom_point() +
  xlab("Training size") + ylab("Median test MSE") +
  ggtitle("Median test MSE vs. training size for optimized branch-lengths") +
  theme(plot.title = element_text(hjust = 0.5), axis.text=element_text(size=11), axis.title=element_text(size=11))
# BUG FIX: the original referenced the typo `optimized_R2e`, reused the MSE
# title for the R2 plot, and never closed its theme()/ggplot() call — which
# produced the "unexpected symbol" parse error recorded below.
g01 <- ggplot(optimized_R2, aes(x=actucal_training_size, y=lasso_test_R.2_Median)) +
  geom_line() + geom_point() +
  xlab("Training size") + ylab("Median test R2") +
  ggtitle("Median test R2 vs. training size for optimized branch-lengths") +
  theme(plot.title = element_text(hjust = 0.5), axis.text=element_text(size=11), axis.title=element_text(size=11))
#ggarrange(g00, g01,g02,hjust=-1,vjust=0.5,
#          labels = c("A", "B","C"),
#          ncol = 1, nrow =3)
# The ggplot below was left unfinished in the notebook (it ended with a
# trailing `+`); commented out until it is completed.
#g0<-ggplot(data_only_lasso,aes(x=as.factor(actucal_training_size),color=brlen_generator, y=lasso_test_R.2))+
Error: unexpected symbol in:
"
g0"
Statistical analysis of test MSE vs training size and MSA metrics
library(rstatix)
library(nlme)
library(multcomp)
# Prepare covariates for the mixed-effects analysis of test MSE.
data_only_lasso$training_size_factor <- as.factor(data_only_lasso$actucal_training_size)
data_only_lasso$n_loci.1000 <- data_only_lasso$n_loci/1000  # loci in thousands
data_exponential <- data_only_lasso %>% filter(brlen_generator=="exponential")
data_optimized <- data_only_lasso %>% filter(brlen_generator=="optimized")
data_uniform <- data_only_lasso %>% filter(brlen_generator=="uniform")
# BUG FIX: the original fitted into `model` but then used the undefined
# `model.uniform`, and it never prefixed the factor levels with "training_"
# as the glht() contrast labels require (cf. the corrected chunk below).
data_uniform$training_size_factor <- as.factor(paste('training_', data_uniform$training_size_factor, sep = ""))
model.uniform <- lme(lasso_test_mse~training_size_factor+divergence+mad+gap_pct+n_loci.1000, random=~1|dataset_id, data=data_uniform)
plot(model.uniform)
qqnorm(model.uniform$residuals)
print(summary(model.uniform))
print(intervals(model.uniform, which = "fixed"))
# One-sided tests that each larger training size does NOT reduce MSE,
# versus the alternative that it does (consecutive-size contrasts).
summary(glht(model=model.uniform, linfct=mcp(training_size_factor =c("training_200 - training_100 >= 0","training_400 - training_200 >= 0","training_800 - training_400 >= 0","training_1600 - training_800 >= 0","training_3200 - training_1600 >= 0")),test = adjusted(type = "bonferroni")))
uniform
# Uniform branch-length generator: mixed model of test MSE with a random
# intercept per dataset.
data_uniform<-data_only_lasso %>% filter(brlen_generator=="uniform")
# Prefix factor levels with "training_" so glht() contrast labels match.
data_uniform$training_size_factor = as.factor(paste('training_', data_uniform$training_size_factor,sep = ""))
model.uniform<-lme(lasso_test_mse~training_size_factor+divergence+mad+gap_pct+n_loci.1000,random=~1|dataset_id, data=data_uniform)
# Residual diagnostics.
plot(model.uniform)
qqnorm(model.uniform$residuals)
print(summary(model.uniform))
Linear mixed-effects model fit by REML
Data: data_uniform
Random effects:
Formula: ~1 | dataset_id
(Intercept) Residual
StdDev: 4924.748 12903.35
Fixed effects: lasso_test_mse ~ training_size_factor + divergence + mad + gap_pct + n_loci.1000
Correlation:
(Intr) t___16 t___20 t___32 t___40 t___80 dvrgnc mad gp_pct
training_size_factortraining_1600 -0.188
training_size_factortraining_200 -0.188 0.500
training_size_factortraining_3200 -0.188 0.500 0.500
training_size_factortraining_400 -0.188 0.500 0.500 0.500
training_size_factortraining_800 -0.188 0.500 0.500 0.500 0.500
divergence -0.427 0.000 0.000 0.000 0.000 0.000
mad -0.697 0.000 0.000 0.000 0.000 0.000 0.263
gap_pct -0.091 0.000 0.000 0.000 0.000 0.000 -0.499 -0.144
n_loci.1000 -0.733 0.000 0.000 0.000 0.000 0.000 0.329 0.301 -0.147
Standardized Within-Group Residuals:
Min Q1 Med Q3 Max
-2.328850562 -0.278646376 -0.004173891 0.183514337 13.367424993
Number of Observations: 324
Number of Groups: 54
# Approximate 95% CIs for the fixed effects of the uniform model.
print(intervals(model.uniform,which = "fixed"))
Approximate 95% confidence intervals
Fixed effects:
lower est. upper
(Intercept) -14136.725 -1155.618 11825.488
training_size_factortraining_1600 -17462.917 -12573.505 -7684.093
training_size_factortraining_200 -13612.376 -8722.964 -3833.551
training_size_factortraining_3200 -17537.171 -12647.759 -7758.346
training_size_factortraining_400 -15989.228 -11099.816 -6210.403
training_size_factortraining_800 -17026.671 -12137.258 -7247.846
divergence 2015.629 4493.347 6971.065
mad -52021.528 -16266.467 19488.594
gap_pct -36520.822 -20831.781 -5142.740
n_loci.1000 4509.922 7428.200 10346.479
attr(,"label")
[1] "Fixed effects:"
# One-sided consecutive-size contrasts: does increasing training size lower MSE?
summary(glht(model=model.uniform, linfct=mcp(training_size_factor =c("training_200 - training_100 >= 0","training_400 - training_200 >= 0","training_800 - training_400 >= 0","training_1600 - training_800 >= 0","training_3200 - training_1600 >= 0")),test = adjusted(type = "bonferroni")))
Simultaneous Tests for General Linear Hypotheses
Multiple Comparisons of Means: User-defined Contrasts
Fit: lme.formula(fixed = lasso_test_mse ~ training_size_factor + divergence +
mad + gap_pct + n_loci.1000, data = data_uniform, random = ~1 |
dataset_id)
Linear Hypotheses:
Estimate Std. Error z value Pr(<z)
training_200 - training_100 >= 0 -8722.96 2483.25 -3.513 0.00113 **
training_400 - training_200 >= 0 -2376.85 2483.25 -0.957 0.66510
training_800 - training_400 >= 0 -1037.44 2483.25 -0.418 0.95169
training_1600 - training_800 >= 0 -436.25 2483.25 -0.176 0.99144
training_3200 - training_1600 >= 0 -74.25 2483.25 -0.030 0.99803
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)
# Diagnostics for the exponential-generator MSE distribution.
# sqrt() replaces the non-idiomatic `**0.5` (undocumented alias of `^`).
plot(data_only_lasso$n_loci, sqrt(data_only_lasso$lasso_test_mse))
hist(data_only_lasso[data_only_lasso$brlen_generator=="exponential",]$lasso_test_mse, breaks=100)
boxplot(data_only_lasso[data_only_lasso$brlen_generator=="exponential",]$lasso_test_mse)
# MSE summary at the largest training size only.
print(summary(data_only_lasso[data_only_lasso$brlen_generator=="exponential" & data_only_lasso$actucal_training_size==3200,]$lasso_test_mse))
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.48 110.79 214.10 979.79 710.08 8892.41
# Test R^2 summary for the exponential generator at training size 3200.
print(summary(data_only_lasso[data_only_lasso$brlen_generator=="exponential" & data_only_lasso$actucal_training_size==3200,]$lasso_test_R.2))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8277 0.9902 0.9979 0.9881 0.9991 0.9997
optimized
# Optimized branch lengths: same mixed model as for the uniform generator.
data_optimized<-data_only_lasso %>% filter(brlen_generator=="optimized")
# Prefix factor levels with "training_" so glht() contrast labels match.
data_optimized$training_size_factor = as.factor(paste('training_', data_optimized$training_size_factor,sep = ""))
model.optimized<-lme(lasso_test_mse~training_size_factor+divergence+mad+gap_pct+n_loci.1000,random=~1|dataset_id, data=data_optimized)
print(summary(model.optimized))
Linear mixed-effects model fit by REML
Data: data_optimized
Random effects:
Formula: ~1 | dataset_id
(Intercept) Residual
StdDev: 99.61194 147.2473
Fixed effects: lasso_test_mse ~ training_size_factor + divergence + mad + gap_pct + n_loci.1000
Correlation:
(Intr) t___16 t___20 t___32 t___40 t___80 dvrgnc mad gp_pct
training_size_factortraining_1600 -0.135
training_size_factortraining_200 -0.135 0.500
training_size_factortraining_3200 -0.135 0.500 0.500
training_size_factortraining_400 -0.135 0.500 0.500 0.500
training_size_factortraining_800 -0.135 0.500 0.500 0.500 0.500
divergence -0.433 0.000 0.000 0.000 0.000 0.000
mad -0.708 0.000 0.000 0.000 0.000 0.000 0.263
gap_pct -0.092 0.000 0.000 0.000 0.000 0.000 -0.499 -0.144
n_loci.1000 -0.744 0.000 0.000 0.000 0.000 0.000 0.329 0.301 -0.147
Standardized Within-Group Residuals:
Min Q1 Med Q3 Max
-3.093391833 -0.249261687 0.002988116 0.187539098 9.051032170
Number of Observations: 324
Number of Groups: 54
# Residual diagnostics and fixed-effect CIs for the optimized model.
plot(model.optimized)
print(intervals(model.optimized,which = "fixed"))
Approximate 95% confidence intervals
Fixed effects:
lower est. upper
(Intercept) -65.98936 140.32821 346.64578
training_size_factortraining_1600 -291.51870 -235.72290 -179.92710
training_size_factortraining_200 -200.71387 -144.91807 -89.12227
training_size_factortraining_3200 -294.19936 -238.40356 -182.60776
training_size_factortraining_400 -257.55783 -201.76202 -145.96622
training_size_factortraining_800 -278.59913 -222.80332 -167.00752
divergence 12.76796 52.74287 92.71778
mad -1132.50329 -555.63978 21.22372
gap_pct -440.84519 -187.72200 65.40119
n_loci.1000 40.74230 87.82510 134.90790
attr(,"label")
[1] "Fixed effects:"
# One-sided consecutive-size contrasts for the optimized model.
summary(glht(model=model.optimized, linfct=mcp(training_size_factor =c("training_200 - training_100 >= 0","training_400 - training_200 >= 0","training_800 - training_400 >= 0","training_1600 - training_800 >= 0","training_3200 - training_1600 >= 0")),test = adjusted(type = "bonferroni")))
Simultaneous Tests for General Linear Hypotheses
Multiple Comparisons of Means: User-defined Contrasts
Fit: lme.formula(fixed = lasso_test_mse ~ training_size_factor + divergence +
mad + gap_pct + n_loci.1000, data = data_optimized, random = ~1 |
dataset_id)
Linear Hypotheses:
Estimate Std. Error z value Pr(<z)
training_200 - training_100 >= 0 -144.918 28.338 -5.114 <0.001 ***
training_400 - training_200 >= 0 -56.844 28.338 -2.006 0.109
training_800 - training_400 >= 0 -21.041 28.338 -0.743 0.807
training_1600 - training_800 >= 0 -12.920 28.338 -0.456 0.941
training_3200 - training_1600 >= 0 -2.681 28.338 -0.095 0.996
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)
exponential
# Exponential branch lengths: same mixed model as for the other generators.
data_exponential<-data_only_lasso %>% filter(brlen_generator=="exponential")
# Prefix factor levels with "training_" so glht() contrast labels match.
data_exponential$training_size_factor = as.factor(paste('training_', data_exponential$training_size_factor,sep = ""))
model.exponential<-lme(lasso_test_mse~training_size_factor+divergence+mad+gap_pct+n_loci.1000,random=~1|dataset_id, data=data_exponential)
plot(model.exponential)
print(summary(model.exponential))
Linear mixed-effects model fit by REML
Data: data_exponential
Random effects:
Formula: ~1 | dataset_id
(Intercept) Residual
StdDev: 2539.826 32238.76
Fixed effects: lasso_test_mse ~ training_size_factor + divergence + mad + gap_pct + n_loci.1000
Correlation:
(Intr) t___16 t___20 t___32 t___40 t___80 dvrgnc mad gp_pct
training_size_factortraining_1600 -0.247
training_size_factortraining_200 -0.247 0.500
training_size_factortraining_3200 -0.247 0.500 0.500
training_size_factortraining_400 -0.247 0.500 0.500 0.500
training_size_factortraining_800 -0.247 0.500 0.500 0.500 0.500
divergence -0.417 0.000 0.000 0.000 0.000 0.000
mad -0.681 0.000 0.000 0.000 0.000 0.000 0.263
gap_pct -0.089 0.000 0.000 0.000 0.000 0.000 -0.499 -0.144
n_loci.1000 -0.716 0.000 0.000 0.000 0.000 0.000 0.329 0.301 -0.147
Standardized Within-Group Residuals:
Min Q1 Med Q3 Max
-1.17729607 -0.27086453 -0.03456377 0.14784375 16.22606510
Number of Observations: 324
Number of Groups: 54
# Approximate 95% CIs for the fixed effects of the exponential model.
print(intervals(model.exponential,which = "fixed"))
Approximate 95% confidence intervals
Fixed effects:
lower est. upper
(Intercept) -35178.433 -10480.679 14217.076
training_size_factortraining_1600 -34253.657 -22037.558 -9821.460
training_size_factortraining_200 -28945.529 -16729.430 -4513.332
training_size_factortraining_3200 -34613.119 -22397.020 -10180.922
training_size_factortraining_400 -33505.354 -21289.255 -9073.157
training_size_factortraining_800 -33991.002 -21774.903 -9558.805
divergence 5609.752 10215.305 14820.858
mad -71953.762 -5492.677 60968.408
gap_pct -70215.844 -41053.240 -11890.637
n_loci.1000 7579.053 13003.515 18427.976
attr(,"label")
[1] "Fixed effects:"
# Q-Q plot of the dataset-level random effects of the exponential model.
qqnorm(model.exponential, ~ranef(., level=1))
# One-sided consecutive-size contrasts for the exponential model.
summary(glht(model=model.exponential, linfct=mcp(training_size_factor =c("training_200 - training_100 >= 0","training_400 - training_200 >= 0","training_800 - training_400 >= 0","training_1600 - training_800 >= 0","training_3200 - training_1600 >= 0")),test = adjusted(type = "bonferroni")))
Simultaneous Tests for General Linear Hypotheses
Multiple Comparisons of Means: User-defined Contrasts
Fit: lme.formula(fixed = lasso_test_mse ~ training_size_factor + divergence +
mad + gap_pct + n_loci.1000, data = data_exponential, random = ~1 |
dataset_id)
Linear Hypotheses:
Estimate Std. Error z value Pr(<z)
training_200 - training_100 >= 0 -16729.4 6204.4 -2.696 0.0175 *
training_400 - training_200 >= 0 -4559.8 6204.4 -0.735 0.8117
training_800 - training_400 >= 0 -485.6 6204.4 -0.078 0.9967
training_1600 - training_800 >= 0 -262.7 6204.4 -0.042 0.9977
training_3200 - training_1600 >= 0 -359.5 6204.4 -0.058 0.9973
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)